In [220]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Default seaborn palette; indexed later when colouring the month bar chart.
color = sns.color_palette()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None
# Allow up to 999 columns when displaying wide DataFrames.
pd.options.display.max_columns = 999
In [278]:
def draw_scatter_plot(df, col_name):
    """Scatter-plot the sorted values of one column against their rank.

    Args:
        df: DataFrame that contains ``col_name``.
        col_name: Column to plot; also used as the y-axis label.
    """
    column_values = df[col_name].values
    plt.figure(figsize=(8, 6))
    ranks = range(len(column_values))
    plt.scatter(ranks, np.sort(column_values))
    plt.xlabel('index', fontsize=12)
    plt.ylabel(col_name, fontsize=12)
    plt.show()
def draw_dist_plot(df, col_name):
    """Draw a 50-bin histogram (KDE disabled) of one DataFrame column.

    Args:
        df: DataFrame that contains ``col_name``.
        col_name: Column to plot; also used as the x-axis label.
    """
    column_values = df[col_name].values
    _, axis = plt.subplots(figsize=(12, 8))
    sns.distplot(column_values, bins=50, kde=False, ax=axis)
    axis.set_xlabel(col_name, fontsize=12)
    axis.set_ylabel('count', fontsize=12)
    plt.show()
def draw_count_plot(df, col_name, title='plot'):
    """Draw a count plot (one bar per distinct value) for a column.

    Args:
        df: DataFrame that contains ``col_name``.
        col_name: Column whose values are counted; also the x-axis label.
        title: Figure title.
    """
    plt.figure(figsize=(12, 8))
    axis = sns.countplot(x=col_name, data=df)
    plt.xticks(rotation='vertical')
    axis.set_xlabel(col_name, fontsize=12)
    axis.set_ylabel('count', fontsize=12)
    axis.set_title(title, fontsize=15)
    plt.show()
def draw_box_plot(df, x_col, y_col):
    """Draw a box plot of ``y_col`` grouped by the values of ``x_col``.

    Args:
        df: DataFrame containing both columns.
        x_col: Grouping column (x axis).
        y_col: Value column (y axis).
    """
    plt.figure(figsize=(12, 8))
    axis = sns.boxplot(x=x_col, y=y_col, data=df)
    axis.set_xlabel(x_col, fontsize=12)
    axis.set_ylabel(y_col, fontsize=12)
    plt.show()
def draw_violin_plot(df, x_col, y_col):
    """Draw a violin plot of ``y_col`` grouped by the values of ``x_col``.

    Args:
        df: DataFrame containing both columns.
        x_col: Grouping column (x axis).
        y_col: Value column (y axis).
    """
    plt.figure(figsize=(12, 8))
    axis = sns.violinplot(x=x_col, y=y_col, data=df)
    axis.set_xlabel(x_col, fontsize=12)
    axis.set_ylabel(y_col, fontsize=12)
    plt.show()
def draw_plots(df, col_name):
    """Show the sorted-value scatter plot, then the histogram, for one column.

    Args:
        df: DataFrame that contains ``col_name``.
        col_name: Column to visualise.
    """
    for plotter in (draw_scatter_plot, draw_dist_plot):
        plotter(df, col_name)
def draw_np_array_scatter_plot(np_array, col_name):
    """Scatter-plot the sorted values of an array against their rank.

    Args:
        np_array: Values to plot.
        col_name: Label for the y axis.
    """
    plt.figure(figsize=(8, 6))
    ranks = range(len(np_array))
    ordered = np.sort(np_array)
    plt.scatter(ranks, ordered)
    plt.xlabel('index', fontsize=12)
    plt.ylabel(col_name, fontsize=12)
    plt.show()
def draw_np_array_dist_plot(np_array, col_name):
    """Draw a 50-bin histogram (KDE disabled) of an array of values.

    Args:
        np_array: Values to plot.
        col_name: Label for the x axis.
    """
    _, axis = plt.subplots(figsize=(12, 8))
    sns.distplot(np_array, bins=50, kde=False, ax=axis)
    axis.set_xlabel(col_name, fontsize=12)
    axis.set_ylabel('count', fontsize=12)
    plt.show()
def convert_outlier_value(df, col_name, upper_percentile=99.0, lower_percentile=1.0):
    """Clamp one column of ``df``, in place, to its percentile limits.

    Values above the ``upper_percentile`` value are replaced by that value and
    values below the ``lower_percentile`` value are replaced by that value.
    The two limits are printed.

    Args:
        df: DataFrame to modify in place.
        col_name: Column to clamp.
        upper_percentile: Percentile (0-100) used as the upper limit.
        lower_percentile: Percentile (0-100) used as the lower limit.

    Returns:
        None; ``df`` is modified in place.
    """
    np_array = df[col_name].values
    ulimit = np.percentile(np_array, upper_percentile)
    llimit = np.percentile(np_array, lower_percentile)
    print('upper limit :', ulimit, ', lower limit :', llimit)
    # Use a single .loc[rows, column] assignment so the write lands on `df`
    # itself. The chained form `df[col].loc[mask] = v` may assign into a
    # temporary copy and silently leave `df` unchanged.
    df.loc[df[col_name] > ulimit, col_name] = ulimit
    df.loc[df[col_name] < llimit, col_name] = llimit
In [222]:
from subprocess import check_output
# List the contents of the local `input` directory via the shell `ls` command.
input_listing = check_output(['ls', 'input'])
print(input_listing.decode('utf8'))
In [223]:
# Load the training transactions; the transactiondate column is parsed as dates.
train_df = pd.read_csv('input/train_2016_v2.csv', parse_dates=['transactiondate'])
print('train_df.shape :', train_df.shape)
# `display` is not imported in this file — presumably the IPython notebook built-in.
display(train_df.head())
In [224]:
# Sorted-value scatter and histogram of logerror.
draw_plots(train_df, 'logerror')
In [225]:
# Clamp logerror in place to its default 1st/99th percentile limits.
convert_outlier_value(train_df, 'logerror')
In [226]:
# Same plots as before, drawn again after the clamping call.
draw_plots(train_df, 'logerror')
In [227]:
# Derive the calendar month of each transaction as a new column.
train_df['transaction_month'] = train_df['transactiondate'].dt.month
display(train_df.head())
In [228]:
# Number of transactions per distinct date, plotted in ascending order of count.
transactiondate_cnt_srs = train_df['transactiondate'].value_counts()
sorted_date_counts = np.sort(transactiondate_cnt_srs)
plt.figure(figsize=(8, 6))
plt.scatter(range(len(sorted_date_counts)), sorted_date_counts)
plt.xlabel('transactiondate', fontsize=12)
plt.ylabel('counts', fontsize=12)
plt.show()
In [229]:
# Number of transactions in each calendar month.
cnt_srs = train_df['transaction_month'].value_counts()
plt.figure(figsize=(12, 6))
# x and y are passed by keyword: newer seaborn releases no longer accept the
# data vectors positionally, while the keyword form works on every version.
sns.barplot(x=cnt_srs.index, y=cnt_srs.values, alpha=0.8, color=color[3])
plt.xlabel('Month of transaction', fontsize=12)
plt.ylabel('Number of Occurrences', fontsize=12)
plt.show()
In [230]:
# Distribution of "transactions per parcel": the inner value_counts() gives the
# number of rows for each parcelid, the outer one counts how many parcels share
# each of those numbers. Chaining the two avoids going through reset_index(),
# whose output column names differ between pandas versions.
train_df['parcelid'].value_counts().value_counts()
Out[230]:
In [231]:
# Load the 2016 property attributes; low_memory=False makes pandas read the
# file in one pass instead of chunk-wise dtype inference.
prop_df = pd.read_csv('input/properties_2016.csv', low_memory=False)
print('properties 2016 shape :', prop_df.shape)
display(prop_df.head())
In [232]:
# Per-column count of missing values in prop_df, keeping only columns that
# have at least one missing value, ordered by ascending count.
missing_df = prop_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
has_missing = missing_df['missing_count'] > 0
missing_df = missing_df.loc[has_missing].sort_values(by='missing_count')
display(missing_df)
In [233]:
# Horizontal bar chart of the missing-value count for each column.
ind = np.arange(missing_df.shape[0])
fig, ax = plt.subplots(figsize=(12,18))
rects = ax.barh(ind, missing_df.missing_count.values, color='blue')
# Fix the tick positions first, then attach the labels, so that each label is
# bound to its own bar rather than to the automatically chosen default ticks.
ax.set_yticks(ind)
ax.set_yticklabels(missing_df.column_name.values)
ax.set_xlabel('Count of missing values')
ax.set_title('Number of missing values in each column')
plt.show()
In [234]:
# Joint distribution of property latitude and longitude.
# jointplot builds its own figure, so no plt.figure() call precedes it (that
# call only produced an additional, empty figure).
# NOTE(review): `size` is the legacy name of jointplot's figure-size argument;
# newer seaborn releases call it `height` — confirm against the installed version.
sns.jointplot(x='latitude', y='longitude', data=prop_df, size=10)
plt.show()
In [235]:
# Attach the property attributes to every transaction (left join on parcelid).
# NOTE(review): this rebinds train_df, so re-running the cell merges again.
train_df = train_df.merge(prop_df, how='left', on='parcelid')
train_df.head()
Out[235]:
In [236]:
# Cap DataFrame display at 65 rows.
pd.options.display.max_rows = 65
# One row per column of train_df together with its dtype.
dtype_df = train_df.dtypes.reset_index()
# The first column holds the column names but is labelled 'Count': after the
# groupby(...).aggregate('count') in the following cell it holds the count per dtype.
dtype_df.columns = ['Count', 'Column Type']
dtype_df
Out[236]:
In [237]:
# Number of train_df columns of each dtype.
dtype_df.groupby('Column Type').aggregate('count').reset_index()
Out[237]:
In [238]:
# Fraction of missing values per column of the merged frame; show the columns
# that are more than 99% missing.
missing_df = train_df.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / train_df.shape[0]
nearly_empty = missing_df['missing_ratio'] > 0.99
missing_df.loc[nearly_empty]
Out[238]:
In [239]:
# Impute missing numeric values with the column mean.
# numeric_only=True restricts the means to numeric columns: train_df also holds
# non-numeric columns (e.g. the parsed transactiondate and flag/code columns),
# and recent pandas raises instead of silently skipping columns it cannot average.
mean_values = train_df.mean(axis=0, numeric_only=True)
# Columns without an entry in mean_values are left untouched by fillna.
train_df_new = train_df.fillna(mean_values)
In [240]:
# Names of the columns that still contain missing values after mean imputation.
missing_df = train_df_new.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / train_df_new.shape[0]
still_missing = missing_df['missing_ratio'] > 0
missing_df.loc[still_missing, 'column_name']
Out[240]:
In [241]:
# Normalise taxdelinquencyflag to booleans: True for entries equal to True or
# 'Y', False for everything else (including missing values).
print(train_df_new['taxdelinquencyflag'].value_counts())
train_df_new['taxdelinquencyflag'] = train_df_new['taxdelinquencyflag'].apply(
    lambda flag: bool(flag == True or flag == 'Y')
)
print(train_df_new['taxdelinquencyflag'].value_counts())
In [242]:
# Replace every remaining missing value with the string 'UNKNOWN'.
train_df_new = train_df_new.fillna('UNKNOWN')
In [243]:
# Re-check: names of columns that still contain missing values after the
# 'UNKNOWN' fill.
missing_df = train_df_new.isnull().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df['missing_ratio'] = missing_df['missing_count'] / train_df_new.shape[0]
still_missing = missing_df['missing_ratio'] > 0
missing_df.loc[still_missing, 'column_name']
Out[243]:
In [244]:
# Pearson correlation of every float64 feature with logerror.
x_cols = [
    name
    for name in train_df_new.columns
    if name != 'logerror' and train_df_new[name].dtype == 'float64'
]
labels = []
values = []
for col in x_cols:
    # A constant column has no defined correlation; report it and move on.
    if np.std(train_df_new[col]) == 0.0:
        print('std is 0', col)
        continue
    labels.append(col)
    pair_corr = np.corrcoef(train_df_new[col].values, train_df_new.logerror.values)
    values.append(pair_corr[0, 1])
corr_df = pd.DataFrame({'col_labels': labels, 'corr_values': values}).sort_values(by='corr_values')
In [245]:
# Horizontal bar chart of the correlation coefficients, in corr_df's sorted order.
ind = np.arange(len(labels))
fig, ax = plt.subplots(figsize=(12, 40))
rects = ax.barh(ind, np.array(corr_df.corr_values.values), color='y')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df.col_labels.values, rotation='horizontal')
ax.set_xlabel('Correlation coefficient')
ax.set_title('Correlation coefficient of the variables')
plt.show()
In [246]:
# Keep the features whose correlation with logerror is above 0.02 or below -0.01.
above_threshold = corr_df['corr_values'] > 0.02
below_threshold = corr_df['corr_values'] < -0.01
corr_df_sel = corr_df.loc[above_threshold | below_threshold]
display(corr_df_sel)
In [247]:
# Spearman correlation between the selected features, shown as a heat map.
cols_to_use = corr_df_sel['col_labels'].tolist()
temp_df = train_df_new[cols_to_use]
corrmat = temp_df.corr(method='spearman')
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corrmat, vmax=1., square=True, cmap='coolwarm', ax=ax)
ax.set_title('Important variables correlation map', fontsize=15)
plt.show()
In [248]:
# Feature examined by the cells that follow (they read the notebook-level `col`).
col = 'finishedsquarefeet12'
In [249]:
# Sorted-value scatter of the current feature, then its joint plot with logerror.
draw_scatter_plot(train_df_new, col)
# NOTE(review): `size` is the legacy name of jointplot's figure-size argument — confirm against the installed seaborn.
sns.jointplot(x=col, y='logerror', data=train_df_new, size=10)
Out[249]:
In [250]:
# Clamp the current feature in place to its 0.5th / 99.5th percentile values.
convert_outlier_value(train_df_new, col, lower_percentile=0.5, upper_percentile=99.5)
In [251]:
# Same two plots again, after the clamping call.
draw_scatter_plot(train_df_new, col)
# NOTE(review): `size` is the legacy name of jointplot's figure-size argument — confirm against the installed seaborn.
sns.jointplot(x=col, y='logerror', data=train_df_new, size=10)
Out[251]:
In [252]:
# Feature examined by the cells that follow (they read the notebook-level `col`).
col = "calculatedfinishedsquarefeet"
In [253]:
# Sorted-value scatter of the current feature, then its joint plot with logerror.
draw_scatter_plot(train_df_new, col)
# NOTE(review): `size` is the legacy name of jointplot's figure-size argument — confirm against the installed seaborn.
sns.jointplot(x=col, y='logerror', data=train_df_new, size=10)
Out[253]:
In [254]:
# Clamp the current feature in place to its 0.5th / 99.5th percentile values.
convert_outlier_value(train_df_new, col, lower_percentile=0.5, upper_percentile=99.5)
In [255]:
# Same two plots again, after the clamping call.
draw_scatter_plot(train_df_new, col)
# NOTE(review): `size` is the legacy name of jointplot's figure-size argument — confirm against the installed seaborn.
sns.jointplot(x=col, y='logerror', data=train_df_new, size=10)
Out[255]:
In [274]:
# Feature examined by the cell that follows (it reads the notebook-level `col`).
col = 'bathroomcnt'
In [275]:
# Frequency of each value of the current feature, then logerror per value as box plots.
draw_count_plot(train_df_new, col)
draw_box_plot(train_df_new, col, 'logerror')
In [276]:
# Feature examined by the cell that follows (it reads the notebook-level `col`).
col = 'bedroomcnt'
In [279]:
# Frequency of each value of the current feature, then logerror per value as
# box plots and as violin plots.
draw_count_plot(train_df_new, col)
draw_box_plot(train_df_new, col, 'logerror')
draw_violin_plot(train_df_new, col, 'logerror')
In [280]:
# Feature examined by the cells that follow (they read the notebook-level `col`).
col = "taxamount"
In [282]:
# Sorted-value scatter of the current feature, then its joint plot with logerror.
draw_scatter_plot(train_df_new, col)
# NOTE(review): `size` is the legacy name of jointplot's figure-size argument — confirm against the installed seaborn.
sns.jointplot(x=col, y='logerror', data=train_df_new, size=10)
Out[282]:
In [283]:
# Clamp the current feature in place to its 0.5th / 99.5th percentile values.
convert_outlier_value(train_df_new, col, lower_percentile=0.5, upper_percentile=99.5)
In [284]:
# Same two plots again, after the clamping call.
draw_scatter_plot(train_df_new, col)
# NOTE(review): `size` is the legacy name of jointplot's figure-size argument — confirm against the installed seaborn.
sns.jointplot(x=col, y='logerror', data=train_df_new, size=10)
Out[284]:
In [285]:
# Feature examined by the cell that follows (it reads the notebook-level `col`).
col = 'yearbuilt'
In [286]:
# Sorted-value scatter of the current feature, then its joint plot with logerror.
draw_scatter_plot(train_df_new, col)
# NOTE(review): `size` is the legacy name of jointplot's figure-size argument — confirm against the installed seaborn.
sns.jointplot(x=col, y='logerror', data=train_df_new, size=10)
Out[286]:
In [292]:
# Import only the ggplot names this notebook uses instead of a wildcard import,
# so nothing else in the namespace can be shadowed. scale_color_gradient is
# included because the later ggplot cells call it.
from ggplot import aes, geom_point, ggplot, scale_color_gradient
# logerror against construction year, one point per transaction.
ggplot(aes(x='yearbuilt', y='logerror'), data=train_df_new) + \
    geom_point(color='steelblue', size=1)
Out[292]:
In [294]:
# Property locations coloured by logerror (red = low, blue = high).
(
    ggplot(aes(x='latitude', y='longitude', color='logerror'), data=train_df_new)
    + geom_point()
    + scale_color_gradient(low='red', high='blue')
)
Out[294]:
In [296]:
# Finished area against tax amount, coloured by logerror (pink = low, blue = high).
(
    ggplot(aes(x='finishedsquarefeet12', y='taxamount', color='logerror'), data=train_df_new)
    + geom_point(alpha=0.7)
    + scale_color_gradient(low='pink', high='blue')
)
Out[296]:
In [303]:
# Target vector and feature matrix for the models below. The dropped columns
# are the target, the id/date columns and five flag/code columns.
train_y = train_df_new.logerror.values
drop_cols = [
    "hashottuborspa",
    "propertycountylandusecode",
    "propertyzoningdesc",
    "fireplaceflag",
    "taxdelinquencyflag",
    'parcelid',
    'logerror',
    'transactiondate',
    'transaction_month',
]
numeric_train_df = train_df_new.drop(drop_cols, axis=1)
feat_names = numeric_train_df.columns.values
In [308]:
from sklearn import ensemble
# Fit an extra-trees regressor and rank the features by importance.
model = ensemble.ExtraTreesRegressor(
    n_estimators=25,
    max_depth=30,
    max_features=0.3,
    n_jobs=-1,
    random_state=0,
)
model.fit(numeric_train_df, train_y)
importances = model.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
# Positions of the 20 most important features, most important first.
indices = np.argsort(importances)[::-1][:20]
In [309]:
# Bar chart of the top feature importances with the per-tree standard
# deviation as error bars.
bar_positions = range(len(indices))
plt.figure(figsize=(12, 12))
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, feat_names[indices], rotation='vertical')
plt.xlim([-1, len(indices)])
plt.show()
In [312]:
import xgboost as xgb
# Booster parameters.
# NOTE(review): 'reg:linear' and 'silent' look like legacy xgboost parameter
# names — confirm they are accepted by the installed xgboost version.
xgb_params = dict([
    ('eta', 0.05),
    ('max_depth', 8),
    ('subsample', 0.7),
    ('colsample_bytree', 0.7),
    ('objective', 'reg:linear'),
    ('silent', 1),
    ('seed', 0),
])
dtrain = xgb.DMatrix(numeric_train_df, train_y, feature_names=feat_names)
# The parameters handed to train() are a copy of xgb_params with silent set to 0.
train_params = dict(xgb_params, silent=0)
model = xgb.train(train_params, dtrain, num_boost_round=50)
# plot the important features #
fig, ax = plt.subplots(figsize=(12, 18))
xgb.plot_importance(model, height=0.8, ax=ax)
plt.show()
In [259]:
# NOTE(review): looks like a leftover scratch cell (its execution count, 259, is
# lower than the cells above it); it does not take part in the analysis.
print('hi')